2017년 3,4분기의 데이터를 id별로 sort 시킨 후, 중복행은 제거하였다.
stations <- read_csv("Divvy_Stations_2017_Q3Q4.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## name = col_character(),
## city = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## dpcapacity = col_double(),
## online_date = col_character(),
## X8 = col_logical()
## )
stations <- stations %>% arrange(id)
stations <- unique(stations[,-7])
stations
## # A tibble: 585 x 7
## id name city latitude longitude dpcapacity X8
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <lgl>
## 1 2 Buckingham Fountain Chica~ 41.9 -87.6 27 NA
## 2 3 Shedd Aquarium Chica~ 41.9 -87.6 55 NA
## 3 4 Burnham Harbor Chica~ 41.9 -87.6 23 NA
## 4 5 State St & Harrison St Chica~ 41.9 -87.6 23 NA
## 5 6 Dusable Harbor Chica~ 41.9 -87.6 39 NA
## 6 7 Field Blvd & South Wa~ Chica~ 41.9 -87.6 19 NA
## 7 9 Leavitt St & Archer A~ Chica~ 41.8 -87.7 19 NA
## 8 11 Jeffery Blvd & 71st St Chica~ 41.8 -87.6 11 NA
## 9 12 South Shore Dr & 71st~ Chica~ 41.8 -87.6 15 NA
## 10 13 Wilton Ave & Diversey~ Chica~ 41.9 -87.7 27 NA
## # ... with 575 more rows
leaflet(data = stations) %>% addTiles() %>%
addCircleMarkers(lat = ~latitude,lng = ~longitude,
popup=paste(stations$name,'<br>','Depacity :',stations$dpcapacity),
opacity = 0.6, radius = stations$dpcapacity/3) # 반지름은 남은 자전거 수에 비례
ggplot(stations, aes(x=dpcapacity)) +
geom_density(alpha=.4, fill="#FF6666",color="#FF6666") +
labs(fill="",title="Density Plot : Number of total docks at each station") +
theme_fivethirtyeight()
summary(stations$dpcapacity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 15.00 15.00 17.46 19.00 55.00
# map에 시각화
leaflet(data = stations %>% filter(dpcapacity>30)) %>% addTiles() %>%
addMarkers(lat = ~latitude,lng = ~longitude,
popup=paste(stations$name,'<br>','Depacity :',stations$dpcapacity))
2017년 3,4분기의 자전거 사용 데이터를 이용하였다. 데이터의 형태와 열에 대한 결측치 수는 다음과 같다. usertype은 성별과 생년월일 값이 존재하지 않은 Customer과 Dependent를 제외한 Subscriber을 사용하였다.
Trips_Q3 <- read_csv("Divvy_Trips_2017_Q3.csv")
## Parsed with column specification:
## cols(
## trip_id = col_double(),
## start_time = col_character(),
## end_time = col_character(),
## bikeid = col_double(),
## tripduration = col_double(),
## from_station_id = col_double(),
## from_station_name = col_character(),
## to_station_id = col_double(),
## to_station_name = col_character(),
## usertype = col_character(),
## gender = col_character(),
## birthyear = col_double()
## )
Trips_Q4 <- read_csv("Divvy_Trips_2017_Q4.csv")
## Parsed with column specification:
## cols(
## trip_id = col_double(),
## start_time = col_character(),
## end_time = col_character(),
## bikeid = col_double(),
## tripduration = col_double(),
## from_station_id = col_double(),
## from_station_name = col_character(),
## to_station_id = col_double(),
## to_station_name = col_character(),
## usertype = col_character(),
## gender = col_character(),
## birthyear = col_double()
## )
trips <- rbind(Trips_Q3,Trips_Q4)
rm(Trips_Q3,Trips_Q4)
table(trips$usertype)
##
## Customer Dependent Subscriber
## 519710 3 1757796
colSums(is.na(trips))
## trip_id start_time end_time bikeid
## 0 0 0 0
## tripduration from_station_id from_station_name to_station_id
## 0 0 0 0
## to_station_name usertype gender birthyear
## 0 0 519960 520075
trips <- trips %>% filter(usertype=='Subscriber')
trips
## # A tibble: 1,757,796 x 12
## trip_id start_time end_time bikeid tripduration from_station_id
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1.67e7 9/30/2017~ 10/1/20~ 1411 349 216
## 2 1.67e7 9/30/2017~ 10/1/20~ 3048 354 216
## 3 1.67e7 9/30/2017~ 10/1/20~ 2590 226 141
## 4 1.67e7 9/30/2017~ 10/1/20~ 1287 530 96
## 5 1.67e7 9/30/2017~ 10/1/20~ 6132 1072 478
## 6 1.67e7 9/30/2017~ 10/1/20~ 5235 497 114
## 7 1.67e7 9/30/2017~ 10/1/20~ 54 214 87
## 8 1.67e7 9/30/2017~ 10/1/20~ 5794 1072 296
## 9 1.67e7 9/30/2017~ 10/1/20~ 2730 1080 296
## 10 1.67e7 9/30/2017~ 10/1/20~ 6293 425 334
## # ... with 1,757,786 more rows, and 6 more variables:
## # from_station_name <chr>, to_station_id <dbl>, to_station_name <chr>,
## # usertype <chr>, gender <chr>, birthyear <dbl>
start.id <- sort(table(trips$from_station_id), decreasing=TRUE)
end.id <- sort(table(trips$to_station_id), decreasing=TRUE)
n = 10
top_n <- data.frame(start.id=paste0(names(start.id[1:n]),"station"),
start.freq=as.vector(start.id[1:n]),
end.id=paste0(names(end.id[1:n]),"station"),
end.freq=as.vector(end.id[1:n]))
gg1 <- ggplot(top_n, aes(x=reorder(start.id,-start.freq),y=start.freq,fill=start.freq)) +
geom_bar(stat="identity") +
labs(fill="",title="Frequency of Where trip originated") +
theme_fivethirtyeight() + theme(legend.position = "none")
gg2 <- ggplot(top_n, aes(x=reorder(end.id,-end.freq),y=end.freq,fill=end.freq)) +
geom_bar(stat="identity") +
labs(fill="",title="Frequency of Where trip terminated") +
theme_fivethirtyeight() + theme(legend.position = "none")
ggplotly(gg1)
ggplotly(gg2)
trips <- trips %>% mutate(minutes = tripduration/60,
hours = tripduration/60/60,
age = 2019-birthyear+1)
trips$age_bin <- trips$age %>% .bincode(seq(0,120,20))
trips$age_bin <- sapply(trips$age_bin,function(bin) {
return(paste0((bin-1)*20,"-",(bin*20)," Years Old"))
})
table(trips$age_bin) %>% lapply({
. %>% format(big.mark=",") %>% return
})
## $`0-20 Years Old`
## [1] "1,611"
##
## $`100-120 Years Old`
## [1] "620"
##
## $`20-40 Years Old`
## [1] "1,223,049"
##
## $`40-60 Years Old`
## [1] "450,044"
##
## $`60-80 Years Old`
## [1] "80,825"
##
## $`80-100 Years Old`
## [1] "877"
##
## $`NA-NA Years Old`
## [1] "770"
# 나이가 80이하인 사람들 데이터만 추출
trips <- trips %>% filter(age<=80)
trips$start_time <- as.POSIXlt(trips$start_time, format="%m/%d/%Y %H:%M:%S")
trips$end_time <- as.POSIXlt(trips$end_time, format="%m/%d/%Y %H:%M:%S")
trips$start_hour <- hour(trips$start_time)
trips$end_hour <- hour(trips$end_time)
trips$mm <- hour(trips$start_time)*60 + minute(trips$start_time)
trips$mm2 <- hour(trips$end_time)*60 + minute(trips$end_time)
trips$start_day <- wday(trips$start_time,label = T, abbr = F, week_start = 1)
# Weekend/Weekday
trips$start_day_type <- ifelse(wday(trips$start_time, week_start = 1)>5, "Weekend", "Weekday")
# Week of year
trips$week <- week(trips$start_time)
# Month (1-12)
trips$month <- month(trips$start_time,label = T,abbr = F)
# Month (January-December)
trips$month_text <- month(trips$start_time,label = T,abbr = F)
# Remove unused levels from factor
trips$month_text <- droplevels(trips$month_text)
ggplot(data=trips, aes(x=week, fill= month_text)) +
geom_histogram(alpha=.9) + theme_fivethirtyeight() + ggtitle("Ride Frequency by Month of Year") +
facet_grid(usertype~age_bin) + scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=trips, aes(x=week, fill= age_bin)) +
geom_histogram(alpha=.9,aes(y=..density..)) + theme_fivethirtyeight() + ggtitle("Ride Distribution by Week of Year") +
geom_density(alpha=0,color=rgb(1,0,0,.4)) +
facet_grid(usertype~age_bin) + scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
trips <- na.omit(trips)
summary(trips$minutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.333 10.183 12.308 16.000 1410.400
ggplot(trips %>% droplevels(),
aes(x=minutes, fill= age_bin)) + xlim(c(0,50))+
geom_density(alpha=.4) +
labs(fill="",title="Trip duration (minutes)") +
theme_fivethirtyeight() + facet_grid(usertype~start_day_type) +
scale_fill_viridis_d(option="A") +
theme(strip.background = element_rect(fill = "#FFFFFF"))
# Start Times Density Plot
ggplot(trips %>% droplevels(),
aes(x=mm, fill= age_bin)) +
geom_density(alpha=.6) +
scale_x_continuous(labels = c("5am","8am","1:30pm","5pm","8pm"),
breaks = c(300,480,750,1020,1200)) +
labs(fill="",title="2017 Q3-Q4, Start Times") +
theme_fivethirtyeight() + facet_grid(gender~start_day_type) +
scale_fill_viridis_d(option="C") +
theme(strip.background = element_rect(fill = "#FFFFFF"))
# End Times Density Plot
ggplot(trips %>% droplevels(),
aes(x=mm2, fill= age_bin)) +
geom_density(alpha=.6) +
scale_x_continuous(labels = c("5am","8am","1:30pm","5pm","8pm"),
breaks = c(300,480,750,1020,1200)) +
labs(fill="",title="2017 Q3-Q4, End Times") +
theme_fivethirtyeight() + facet_grid(gender~start_day_type) +
scale_fill_viridis_d(option="B") +
theme(strip.background = element_rect(fill = "#FFFFFF"))